library(ggplot2)
library(lme4)
library(texreg)

## setwd to directory containing data (.csv) file
setwd("/Users/<path>/R_work")


d <- read.csv("combinedData_individual.csv")

d$notmale <- as.numeric(d$gender != "male")
d$notmale[d$gender == "unknown"] <- NA
summary(d$notmale)

d$notwhite <- as.numeric(d$ethnicity != "white")
d$notwhite[d$ethnicity == "unknown"] <- NA
summary(d$notwhite)

d$continued <- as.numeric(d$outcome == "yes")
d$continued[d$outcome == "unknown"] <- NA
summary(d$continued)

d$interest <- as.numeric(d$codingInterest %in% c("more", "low"))
d$interest[d$codingInterest == "unknown"] <- NA
summary(d$interest)

## create variable for event regions
## (combine event ID (Date) and region ID (Location) to create unique identifiers)
## Note: An "event" refers to all meetings held on a given day, nationwide.
d$eventregion <- as.factor(paste(d$newEventId, d$newRegionId, sep="-"))
d$eventregion.num <- as.numeric(d$eventregion)
table(d$eventregion.num)

# Data Summaries - Frequencies

xtabs(~continued + notmale, data = d)

xtabs(~continued + notwhite, data = d)

xtabs(~continued + interest, data = d)

#*****************************************************************
#
# Logistical regression for H1a (gender and continued engagement)
#
#*****************************************************************

summary(glmer(continued ~ notmale + (1|eventregion), data = d, family = binomial(logit)))

summary(glmer(continued ~ notmale + interest + (1|eventregion), data = d, family = binomial(logit)))

#*******************************************************************
#
# Logistical regression for H1b (ethnicity and continued engagement)
#
#*******************************************************************

summary(glmer(continued ~ notwhite + (1|eventregion), data = d, family = binomial(logit)))

summary(glmer(continued ~ notwhite + interest + (1|eventregion), data = d, family = binomial(logit)))

#*******************************************************************
#
# combination models 
#
#*******************************************************************

model.h1.combo <- glmer(continued ~ notmale + notwhite + interest + (1|eventregion), data = d, family = binomial(logit))
summary(model.h1.combo)

model.h1.gen <- glmer(continued ~ notmale + interest + (1|eventregion), data = d, family = binomial(logit))
summary(model.h1.gen)

model.h1.eth <- glmer(continued ~ notwhite + interest + (1|eventregion), data = d, family = binomial(logit))
summary(model.h1.eth)

#---------------------------------------------------------------
#
# Summary for H1 (underrepresentation and continued engagement)
#
# In all logistical regression models used for H1, underrepresentation
# for either gender or ethnicity alone did not have a significant 
# association with the outcome of continued engagement.
# (Although the results did agree with our predictions of lower future
# engagement if a member of an underrepresented group.)
#
# However, for both H1a (Gender) and H1b (Ethnicity), when a control
# for prior level of coding interest was added, this variable WAS found
# to have a significant association with the outcome -->
#
# Having even a low interest in coding prior to attending an event 
# is associated with a greater likelihood of continuing to engage in coding 
# after the event (compared to having no prior interest in coding.)
#
#---------------------------------------------------------------



#*******************************************************************
#
# Logistical regression for H2 
# (interaction of gender and ethnicity vs. continued engagement)
#
#*******************************************************************

summary(glmer(continued ~ gender_pct + ethnicity_pct + (1|eventregion), data = d, family=binomial("logit")))

summary(glmer(continued ~ gender_pct + ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit")))

## add interaction terms for gender_pct * ethnicity_pct (collinear problems)
summary(glmer(continued ~ gender_pct + ethnicity_pct + gender_pct * ethnicity_pct + (1|eventregion), data = d, family=binomial("logit")))

summary(glmer(continued ~ gender_pct + ethnicity_pct + gender_pct * ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit")))

model.h2.gen.combo <- glmer(continued ~ gender_pct + interest + (1|eventregion), data = d, family=binomial("logit"))
summary(model.h2.gen.combo)

## let's look at gender_pct within male and notmale
model.h2.gen.notmale <- glmer(continued ~ gender_pct + interest + (1|eventregion), data = d[as.logical(d$notmale),], family=binomial("logit"))
summary(model.h2.gen.notmale)
   
model.h2.gen.male <- glmer(continued ~ gender_pct + interest + (1|eventregion), data = d[!as.logical(d$notmale),], family=binomial("logit"))
summary(model.h2.gen.male)

## do the whole thing in a  single model
summary(glmer(continued ~ notmale * gender_pct + interest + (1|eventregion), data = d, family=binomial("logit")))

## look at the same thing with but based on ethnicity
## lets look at gender_pct within male and notmale
summary(glmer(continued ~ ethnicity_pct + interest + (1|eventregion), data = d[as.logical(d$notmale),], family=binomial("logit")))

summary(glmer(continued ~ ethnicity_pct + interest + (1|eventregion), data = d[!as.logical(d$notmale),], family=binomial("logit")))

#######################################


## similar analysis stratified but by ethnicity
model.h2.eth.combo <- glmer(continued ~ ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit"))
summary(model.h2.eth.combo)

model.h2.eth.notwhite <- glmer(continued ~ ethnicity_pct + interest + (1|eventregion), data = d[as.logical(d$notwhite),], family=binomial("logit"))
summary(model.h2.eth.notwhite)

model.h2.eth.white <- glmer(continued ~ ethnicity_pct + interest + (1|eventregion), data = d[!as.logical(d$notwhite),], family=binomial("logit"))
summary(model.h2.eth.white)

## colinearity
summary(glmer(continued ~ notwhite * ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit")))

#---------------------------------------------------------------
#
# Summary for H2 
# (interaction of gender & ethinicity and continued engagement)
#
# The results from this R analysis seem to indicate that while being
# non-male or non-white are both **individually** associated with 
# lower likelihood of continuing with coding, if a participant is BOTH
# non-male AND non-white, they are MORE likely to continue engaging.
# However, all the above effects fails to reach the level of statistical
# significance.
#
# Again, when the control for prior level of coding interest (low vs no)
# is added, this variable is found to be strongly significant, with even
# low-expressed prior interest being associated with a greater likelihood 
# of continued engagement over expressing no prior interest in coding.
#---------------------------------------------------------------

#*******************************************************************
#
# Logistical regression for H3 
# (presentation vs. continued engagement)
#
#*******************************************************************

# Make new "presented" variable where 1 = yes and 0 = no
d$presented.new <- as.numeric(d$presented == "yes")
d$presented.new[d$presented == "unknown"] <- NA

# Frequencies for Presented
table(d$presented.new)

# Frequencies for Presented - For Students with Complete Data
# This looks only at students who provided all voluntary data and returned all surveys.
table(d$presented.new[complete.cases(d)])

## version w/o random effects
summary(glm(continued ~ presented.new, data = d, family=binomial("logit")))

## WITH RANDOM EFFECTS (from specific event)

## presented
summary(glmer(continued ~ presented.new + (1|eventregion), data = d, family=binomial("logit")))

## presented & interest
summary(glmer(continued ~ presented.new + interest + (1|eventregion), data = d, family=binomial("logit")))

## look at gender
summary(glmer(continued ~ notmale + presented.new + notmale * presented.new + interest + (1|eventregion), data = d, family=binomial("logit")))

model.h3.gender <- glmer(continued ~ notmale + presented.new + notmale * presented.new + gender_pct + interest + (1|eventregion), data = d, family=binomial("logit"))
summary(model.h3.gender)

## look at ethnicity
summary(glmer(continued ~ notwhite + presented.new + notwhite * presented.new + interest + (1|eventregion), data = d, family=binomial("logit")))

model.h3.eth  <- glmer(continued ~ notwhite + presented.new + notwhite * presented.new + ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit"))
summary(model.h3.eth)

## REPEATING WITH DIFFERENT OPTIMIZER SPECIFICATION:
model.h3.eth  <- glmer(continued ~ notwhite + presented.new + notwhite * presented.new + ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit"), control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
summary(model.h3.eth)

## adding ethnicity_pct adds very little
summary(glmer(continued ~ notwhite + presented.new + notwhite * presented.new + ethnicity_pct + notwhite * ethnicity_pct + interest + (1|eventregion), data = d, family=binomial("logit")))

## stratified version of the two models for gender
summary(glmer(continued ~ presented.new + gender_pct + interest + (1|eventregion), data = d[as.logical(d$notmale),], family=binomial("logit")))

summary(glmer(continued ~ presented.new + gender_pct + interest + (1|eventregion), data = d[!as.logical(d$notmale),], family=binomial("logit")))

## interaction terms

summary(glmer(continued ~ presented.new + gender_pct + presented.new * gender_pct +  interest + (1|eventregion), data = d[as.logical(d$notmale),], family=binomial("logit")))

summary(glmer(continued ~ presented.new + gender_pct + presented.new * gender_pct + interest + (1|eventregion), data = d[!as.logical(d$notmale),], family=binomial("logit")))

## stratified version of the two models for ethnicity
summary(glmer(continued ~  presented.new + ethnicity_pct + interest + (1|eventregion), data = d[as.logical(d$notwhite),], family=binomial("logit")))

summary(glmer(continued ~ presented.new + ethnicity_pct + interest + (1|eventregion), data = d[!as.logical(d$notwhite),], family=binomial("logit")))

## add interaction
summary(glmer(continued ~  presented.new + ethnicity_pct + interest + presented.new * ethnicity_pct + (1|eventregion), data = d[as.logical(d$notwhite),], family=binomial("logit")))

summary(glmer(continued ~ presented.new + ethnicity_pct + interest  + presented.new * ethnicity_pct + (1|eventregion), data = d[!as.logical(d$notwhite),], family=binomial("logit")))

#---------------------------------------------------------------
#
# Summary for H3 
# (presentation and continued engagement)
#
# This analysis supports our prediction that presenting is associated
# with increased likelihood of continued engagement. Even when the control
# for prior level of interest in coding is added, presenting still has 
# a significant positive effect on continued engagement.
#
#---------------------------------------------------------------

# NOTE: In no case did adding the control for interest change the
# sign or the significance of any of our variables of interest.

screenreg(list(model.h2.eth.combo,
               model.h2.eth.notwhite,
               model.h2.eth.white))

texreg(list(model.h2.eth.combo,
            model.h2.eth.notwhite,
            model.h2.eth.white))

# =====================================================================

screenreg(list(model.h2.gen.combo,
               model.h2.gen.notmale,
               model.h2.gen.male))

texreg(list(model.h2.gen.combo,
            model.h2.gen.notmale,
            model.h2.gen.male))

# =====================================================================

screenreg(list(model.h1.gen,
               model.h3.gender))

texreg(list(model.h1.gen,
            model.h3.gender))

# ========================================================

screenreg(list(model.h1.eth,
               model.h3.eth))

texreg(list(model.h1.eth,
               model.h3.eth))

# ========================================================

## predicted values for prototypical individuals
## https://stackoverflow.com/questions/39683369/change-order-of-one-set-of-factors-plotted-in-ggplot2

# determine the 10th and 90th percentiles for our actual ethnicity percent values
d_nounk <- d[d$ethnicity != "unknown",]
quants.eth_pcts <- quantile(d_nounk$ethnicity_pct, probs=c(0.1, 0.9))

# Use these as lower and upper limits of a sequence of length 200
eth_pct_range <- seq(quants.eth_pcts[1], quants.eth_pcts[2], length.out=200)

# Create a dataframe with 800 rows (for each given eth-pct value,
# there will be different combinations of ethnicity (white = 0, not white = 1) 
# and presented status (presented = 1, not presented = 0))

# We want to be able to plot 4 prediction lines -- using our model for H3.
# Use average attendee interest value (from actual data) for all hypothetical attendees.
# Assume they are all hypothetical attendees at a given value of x are 
# attending the same event (Same hypothetical Event ID and Region ID)
# referred to in this dataframe as eventregion "1-0".

# Plot 1: Proportion continued (y axis) vs Proportion Attendees of the Same Ethnicity
# Subplot 1a: Hypothetical Non-White Attendees, Subplot 1b: Hypothetical White Attendees
# Each subplot has 2 lines plotted -- one where attendees presented, 
# and one where attendees did not present. (See Figure 3 in Paper.)

proto.x.eth_pct <- data.frame(
    ethnicity_pct=rep(eth_pct_range, 4),
    notwhite=c(rep(0, 400), rep(1, 400)),
    Ethnicity=c(rep('White', 400), rep('Not White', 400)),
    presented.new=c(rep(0, 200), rep(1, 200),
                    rep(0, 200), rep(1, 200)),
    interest=mean(complete.cases(d$interest)),
    eventregion="1-0",
    individual=c(rep(0, 200),
                 rep(1, 200),
                 rep(2, 200),
                 rep(3, 200))
)

grid.tmp <- proto.x.eth_pct
grid.tmp$continued <- predict(model.h3.eth,
                              newdata=proto.x.eth_pct,
                              type="response")
test = grid.tmp$Ethnicity

grid.tmp$Ethnicity = factor(grid.tmp$Ethnicity, levels = c("Not White", "White"))

ggplot(data=grid.tmp) + aes(x=ethnicity_pct, y=continued, group=individual,
                            color=as.factor(presented.new)) +
    scale_color_discrete("Presented", breaks=c(0,1), labels=c("False", "True")) +
    scale_y_continuous("Proportion Continued") +
    scale_x_continuous("Proportion Attendees of Same Ethnicity") +
    facet_grid(.~Ethnicity) +
    geom_line() +
    geom_vline(xintercept = 0.2, linetype="dotted", color="black", size=1.0)
    
# Added vertical lines at 0.2 along the x-axes, to highlight the example of 
# predicted persistence with computing ("continued") if a participant attends 
# an event where 20% of the other attendees share the same ethnicity as that 
# participant.



## predicted values for prototypical individuals

# Follow a similar process using Gender, instead of Ethnicity.
# (See Figure 2 in Paper.)

quants.gen_pcts <- quantile(d_nounk$gender_pct, probs=c(0.1, 0.9))
gender_pct_range <- seq(quants.gen_pcts[1], quants.gen_pcts[2], length.out=200)

proto.x.gen_pct <- data.frame(
    gender_pct=rep(gender_pct_range, 4),
    notmale=c(rep(0, 400), rep(1, 400)),
    Gender=c(rep('Male', 400), rep('Not Male', 400)),
    presented.new=c(rep(0, 200), rep(1, 200),
                    rep(0, 200), rep(1, 200)),
    interest=mean(complete.cases(d$interest)),
    eventregion="1-0",
    individual=c(rep(0, 200),
                 rep(1, 200),
                 rep(2, 200),
                 rep(3, 200))
)

proto.x.gen_pct$continued <- predict(model.h3.gender,
                                    newdata=proto.x.gen_pct,
                                    type="response")

predict.glm(model.h3.gender,
            newdata=proto.x.gen_pct,
            type="response",
            se.fit=TRUE)

proto.x.gen_pct$Gender = factor(proto.x.gen_pct$Gender, levels = c("Not Male", "Male"))

ggplot(data=proto.x.gen_pct) + aes(x=gender_pct, y=continued, group=individual,
                                  color=as.factor(presented.new)) +
    scale_color_discrete("Presented", breaks=c(0,1), labels=c("False", "True")) +
    scale_y_continuous("Proportion Continued") +
    scale_x_continuous("Proportion Attendees of Same Gender") +
    facet_grid(.~Gender) +
    geom_line() +
    geom_vline(xintercept = 0.2, linetype="dotted", color="black", size=1.0)


# Added vertical lines at 0.2 along the x-axes, to highlight the example of 
# predicted persistence with computing ("continued") if a participant attends 
# an event where 20% of the other attendees share the same gender as that 
# participant.